# knitr chunk options and the packages used throughout this report.
knitr::opts_chunk$set(echo = TRUE, message = F, warning = F)
library(microbenchmark)  # timing comparisons
library(profvis)         # profiling
library(ggplot2)
library(gridExtra)
library(tidyr)
library(reshape2)
library(plotly)          # interactive (dynamic) plots
library(parallel)
library(pander)          # markdown table rendering

"theme_piss" <- function(size_p = 18, size_c = 14, theme = theme_bw()){
  # Custom ggplot2 theme used throughout the report.
  #   size_p  plot-title text size
  #   size_c  axis-title text size
  #   theme   base (complete) theme onto which the custom settings are applied
  # Returns a theme object to be added to a ggplot.
  #
  # FIX: the base theme must come FIRST. In ggplot2, theme addition merges
  # right over left, so adding the complete base theme after the custom
  # theme() call overrode the custom title/axis settings (size, hjust, ...).
  theme +
    theme(plot.title = element_text(size = size_p, hjust = 0.5,
                                    colour = "#33666C", face = "bold"),
          axis.title = element_text(face = "bold", size = size_c,
                                    colour = "#33666C"),
          legend.title = element_text(colour = "#33666C",
                                      size = 12, face = "bold"))
}


'solar.cycle' <- function( col = 2) {
  # Vertical reference lines at the solar-cycle minima.
  #   col  line colour (default 2, i.e. red)
  # Returns a list of geom_vline layers, ready to be added to a ggplot.
  minima <- c("1923-05-01", "1933-11-01", "1944-02-01", "1954-02-01",
              "1964-10-01", "1976-05-01", "1986-03-01", "1996-06-01",
              "2008-01-01")
  lapply(minima, function(day)
    geom_vline(xintercept = as.numeric(as.Date(day)), col = col, size = 0.3))
}

# Working directory, helper functions and data for this analysis.
# NOTE(review): an absolute setwd() makes the script non-portable — consider
# relative paths or the 'here' package.
setwd("/home/piss/Documents/Consu LSTAT2390 /Q2/data/matlab/MatlabR2016")
#source('smooth.gaussR_LOAD.R')  # Re-created Gaussian smoother
source('interpol_em_pissoort.R')  # Put these in your directory
#load('DataSSN.RData')   # Take full dataset
load('DataSSN.RData')  # Take data since 1981
# NOTE(review): the two load() calls above name the same file but their
# comments claim different periods — verify which dataset is intended.
# data.chris <- read.csv("Alldata.csv")
# 
# data.chris$Date <- as.Date(paste(data.chris$year,
#                                  data.chris$month, data.chris$day,sep = "-"))
# data.chris[data.chris == -1] <- NA
# 
# data.chris <- data.chris[, colnames(data.chris) %in% c("Ng", "Ns") == F]
# # We do it only for SSN here
# 
# data.mat <- spread(data.chris, key="station", value="SSN")
# data.mat <- data.mat[order(data.mat$decdate),]
# 
# data.mat2 <- data.mat[,-(1:5)] # Get matrix of stations in columns only.

1 Explanations of the main Function

All the functions used are available in \(\texttt{interpol2_em_pissoort.R}\). We will focus here on the main functions, in order not to be overwhelmed by code.

# MAIN FUNCTION : 
# interpolsvd_em() fills gaps in monovariate or multivariate data
# by SVD-imputation (closely related to expectation-maximization)
# The method decomposes the data into two time scales, which are processed
# separately and then merged at the end. The cutoff time scale (nsmo) is
# expressed in number of samples. A gaussian filter is used for filtering.
# Monovariate data must be embedded first (nembed>1).
# In the initial data set, gaps are supposed to be filled in with NA !!
# Example for daily solar data with a cutoff at 81 days
# yf = interpolsvd_em(y,3,81,0,20,1);
#  
#   y       data.frame or matrix of data with gaps
#   nembed  embedding dimension, must be > 1 for a monovariate data set
#   nsmo    cutoff time scale (in nr of samples). Set nsmo=0 if only one
#       single time scale is desired.
#  threshold1 stops iterations after relative energy change is < threshold
#   niter   max number of iterations  
#   ncomp   number of significant components, to be specified for running
#           in automatic mode. Default (0) leads to manual selection
#   displ   used to print some results during computation
#   method.smooth chooses you desired method function for smoothing and
#   param.smooth if it is required by the method (set = nsmo if gaussian)
# yf      same data set as y, but with gaps filled
# Ak      weight distribution of the SVD
'interpolsvd_em' <-  function( y, nembed = 1, nsmo = 0, ncomp = 0,
                               threshold1 = 1e-5, niter = 30, displ = F){
  # Fill gaps (coded as NA) in mono- or multivariate data by iterative
  # SVD-imputation (EM-like). See the header comment above for details.
  #   y          data.frame or matrix of data with gaps (NA)
  #   nembed     embedding dimension, must be > 1 for monovariate data
  #   nsmo       cutoff time scale in samples; 0 = single time scale
  #   ncomp      nr of significant components (0 = interactive/default choice)
  #   threshold1 stop when the relative error change drops below this
  #   niter      maximum number of iterations
  #   displ      print diagnostics while running
  # Returns list(y.filled, w.distSVD, errorByComp).
  time <- proc.time() # measure time for computational issues

  if (nembed < 1)
    stop("Please choose nembed >1. If monovariate series, set default =1")
  # FIX: the original test 'nsmo < 1' also rejected nsmo = 0, although the
  # documented way to request a single time scale is precisely nsmo = 0
  # (and the single-scale branch below handles it).
  if (nsmo < 0)
    stop("Please choose other cutoff. If 1 time scale is desired, set nsmo = 0")

  Emax <- 95  # max cumulative energy (%) for nr of components (kept, unused)

  # Detect shape of input and transpose in order to have more rows than
  # columns (faster); transposed back at the end.
  swap <- FALSE
  if ( ncol(y) > 2*nrow(y) ) {   y <- t(y) ;   swap <- TRUE  }


  ## Estimate average and standard deviation per column and standardise

  # FIX: colSums(!is.na(.)) always counts per column; the original
  # apply/lapply combination silently broke when all columns had the same
  # number of observations (apply then returned a matrix, not a list).
  obs.notNA <- colSums(!is.na(y))

  # Answers whether there are sufficient obs per station.
  bad.obs <- which( obs.notNA <= 1 )
  # (From now,) we don't allow stations that have only one obs. Tune it!
  ave_y <- apply(y, 2, mean, na.rm = TRUE)
  sd_y <- apply(y, 2, sd, na.rm = TRUE) # NA's removed for this, so far
  y <- sweep(sweep(y, 2, ave_y, "-"), 2, sd_y, "/")
  # Fill values for stations with all NA or only 1 obs by the mean of the
  # other stations' statistics.
  # In Matlab they replaced by 0 and 1 but it introduced errors.
  ave_y[bad.obs] <- mean(ave_y[-bad.obs]) ; sd_y[bad.obs] <- mean(sd_y[-bad.obs])


  ## Perform some sanity tests

  # FIX: use apply() over columns; sapply() on a matrix (after the transpose
  # above) iterates over elements, not columns.
  all.na <- apply(y, 2, function(x) all(is.na(x)))
  if ( any(all.na) ) {
    cat("column(s)", colnames(y)[all.na], "have only missing values ! \n")
    stop('each column should have at least some valid values ')
  }
  if ( ncol(y) < 2 && nembed < 2 )
    stop(' embedding dimension must be >1 for monovariate records ')
  # FIX: the original message referenced the undefined variable 'ncol_y'.
  if ( ncomp > ncol(y)*nembed )
    stop(paste('number of components cannot exceed ', ncol(y)*nembed))

  # Embed records if necessary (external helper embedy())
  if (nembed > 1)  x <- embedy(as.matrix(y), nembed, displ = FALSE)
  else   x <- y


  ## Weigh each record according to its number of NA's:
  ## larger weight is given to records with fewer gaps.
  n.NA <- apply(x, 2, function(x) sum(is.na(x)))
  weight <- (nrow(x) - n.NA) / nrow(x)
  weight <- weight / max(weight)
  weight <- weight * weight

  x <- sweep(x, 2, weight, "*")

  nNA <- sum(is.na(x))  # total number of gaps to be imputed

  # First iteration: start by filling in gaps by linear interpolation
  # (simple, but only a starting point for the EM iterations).
  ave_x <- rep(NA, ncol(x))
  xnew <- x
  for (i in 1:ncol(x)){
    w <- which(!is.na(x[,i]))
    ave_x[i] <- mean(x[w,i])   # re-used when recomposing the data below
    xnew[,i] <- approx(c(0, w, nrow(x)+1), c(0, x[w,i], 0), (1:nrow(x)))$y
    # xnew now has NA's replaced by (simple) linear interpolation
  }
  print(paste("total NA is : ", nNA))

  # Subtract again the mean over the stations
  xnew <- sweep(xnew, 2, apply(xnew, 2, mean), "-")
  # Positions of the NA's, re-imputed at every iteration of the loops below
  ind_Na <- which(is.na(x), arr.ind = TRUE)

  # First estimate the dominant mode (rank-1 approximation)
  iter.count <- 0
  err <- numeric(length = niter) # store error associated to each iteration
  while ( iter.count < niter ){
    xfit <- rank_reduce(xnew, 1)  ;    xold <- xnew
    xnew[ind_Na] <- xfit[ind_Na]  # impute the gaps from the rank-1 SVD fit
    xnew <- sweep(xnew, 2, apply(xnew, 2, mean), "-")

    e <- xnew[ind_Na] - xold[ind_Na]
    err[iter.count+1] <- sqrt( t(e) %*% e / nNA )

    if ( err[iter.count+1] < threshold1 ){ # dominant mode is enough: stop
      cat(" iterations stopped at", iter.count, "for error =", err[iter.count+1])
      break
    }
    iter.count <- iter.count + 1
  }

  ## Choose the number of significant components
  if (ncomp < 1){
    sv <- svd(xnew)      # FIX: do not shadow base::svd with a local 'svd'
    S <- sv$d  ;   U <- sv$u  ;   V <- sv$v   ;   Ak <- diag(S)
    # FIX: energy per mode from the singular values directly; the original
    # 'length(E)' on the diag-matrix product counted n^2 entries, not modes.
    E <- 100 * S^2 / sum(S^2)   # fraction of energy for each SVD mode
    nE <- length(E)
    # FIX: in the original, 'else ncomp2 <- 3' was misplaced inside the
    # displ branch, so ncomp2 was never set when displ = FALSE (the
    # default) and the function errored.  Also coerce the readline()
    # answer to numeric: readline() returns a character string.
    if (displ){
      ncomp2 <- as.numeric(readline(prompt = 'number of significant components ncomp  ='))
      if ( any(ncomp2 > nE) )  stop(paste(' ncomp must not exceed ', nE))
    } else {
      ncomp2 <- 3
    }
    print(paste('using ',ncomp2,' components out of ',nE))
  } else {
    ncomp2 <- ncomp
    sv <- svd(xnew)
    S <- sv$d  ;   U <- sv$u  ;   V <- sv$v  ;   Ak <- diag(S)
  }
  print("main loop starts")
  if (nsmo > 1){
    # Two time scales: split into low-pass (gaussian smoother) and
    # high-pass parts, reduce each to rank k, impute gaps from their sum.
    for (k in 2:ncomp2){
      iter.count <- 0
      while (iter.count < niter ){
        # No NA's are allowed through the (gaussian) smoothing function
        xlp <- smooth_gauss(xnew, nsmo)
        xhp <- xnew - xlp
        xlp <- rank_reduce(xlp, k)  ;     xhp <- rank_reduce(xhp, k)
        xold <- xnew
        xnew[ind_Na] <- xlp[ind_Na] + xhp[ind_Na]
        xnew <- sweep(xnew, 2, apply(xnew, 2, mean), "-")
        e <- xnew[ind_Na] - xold[ind_Na]
        err[iter.count+1] <- sqrt( t(e) %*% e / nNA )

        if (displ){
          # FIX: report the current error only (the original printed the
          # whole err vector).
          print(paste('ncomp = ', k, '  iteration ', niter, ' rel. error = ',
                      round(err[iter.count+1], 8)))
        }
        if (err[iter.count+1] < threshold1){
          cat(" iterations stopped at ", niter, "with error =",
              err[iter.count+1], "\n")
          break
        }
        # NOTE(review): the iteration budget 'niter' is decremented instead
        # of incrementing 'iter.count', so the budget is shared across all
        # components k; kept as in the original implementation — confirm.
        niter <- niter - 1
        cat("time after niter ", niter,  (proc.time() - time)[3], "sec", "\n")
      }
      if (displ) cat('\n')
    }
  }else{
    # Single time scale (nsmo = 0): plain SVD rank-k imputation
    for (k in 1:ncomp2){
      iter.count <- 0
      while (iter.count < niter ){
        xhp <- rank_reduce(xnew, k)
        xold <- xnew   ;    xnew[ind_Na] <- xhp[ind_Na]

        xnew <- sweep(xnew, 2, apply(xnew, 2, mean), "-")

        e <- xnew[ind_Na] - xold[ind_Na]
        # FIX: the original divided by the undefined 'nNa' (case typo for
        # nNA), which made this whole branch error out.
        err[iter.count+1] <- sqrt( t(e) %*% e / nNA )

        if (displ){
          cat('ncomp =  ', k, ' iteration ', iter.count,
              '  rel. error = ', round(err[iter.count+1], 8))
        }
        if (err[iter.count+1] < threshold1) break
        niter <- niter - 1
        cat("time after niter ", niter, "",  (proc.time() - time)[3], "sec", "\n")
      }
    }
  }
  # Recompose the data: undo the weighting and restore per-column means
  for (i in 1:ncol(x)){ # nr of columns is ~low, no need to vectorize
    w <- which(!is.na(x[,i]))
    xnew[,i] <- xnew[,i] / weight[i]
    xnew[,i] <- xnew[,i] - mean(xnew[w,i]) + ave_x[i]
  }

  # De-embed the data if it was embedded (external helper deembedy())
  if (nembed > 1)  yf <- deembedy(xnew, ncol(y), 1, 0)
  else  yf <- xnew

  # Restore mean and standard deviation of each station
  for (i in 1:ncol(y)) {
    yf[,i] <- yf[,i] * sd_y[i] + ave_y[i]
  }

  if (swap)  yf <- t(yf)

  cat("Total time elapsed is", (proc.time() - time)[3], "sec")

  return(list(y.filled = yf,
              w.distSVD = Ak,
              errorByComp = err))
}

2 Run the algorithms : Tests

To do this, we ran the algorithm three times:

Moreover, comparisons between the results provided by the different time spans could be interesting. We could for example train a model using data from 1981 and then validate it using previous period, in order to have an independent measure of accuracy (…)

and once for the unmodified SSN dataset.

## First, we discard stations that have coverage <5% for reliability issues

# If we keep these values, the results will have incorrect values...
# Number of non-missing observations per station (column)
notNa.sum <- apply(data.mat2, 2, function(x) sum(!is.na(x)))
# TRUE for stations whose coverage is below 5% of the time span
is_less5_cov <- unname(notNa.sum) < .05 * nrow(data.mat2)
data.mat2.fin <- data.mat2[, !is_less5_cov] # 7 stations are discarded !! 

#data.mat2.fin <- data.mat2.fin[data.mat$decdate >= 1981, ] # take only from 1981 ?


# Take this for input, as advised in the test.m file 
# Square-root transform of the counts; +1 keeps sqrt defined at 0
y <- sqrt(data.mat2.fin+1) # Select randomly here, for testing


options(mc.cores=parallel::detectCores()) # all available cores

# Gap-filling on the transformed data
z <- interpolsvd_em(y, nembed = 2, nsmo = 81, ncomp = 4,
                      niter = 30, displ = F)
# 193 sec for the whole dataset (with some stations discarded)

z <- z$y.filled

# Same run on the untransformed SSN data
zssn <- interpolsvd_em(data.mat2.fin, nembed = 2, nsmo = 81, ncomp = 4,
                        niter = 30, displ = F)
zssn <- zssn$y.filled
rownames(y) <- rownames(zssn) <- 1:nrow(y)
# Stack raw and filled data, flagged by the 'x' column, for plotting
y1 <- cbind(as.data.frame(data.mat2.fin), Date = data.mat$Date, x = "Raw")

z1 <- cbind(as.data.frame(zssn), Date = data.mat$Date, x = "Filled")
colnames(y1) <- colnames(z1) <- c(colnames(data.mat2.fin), "Date", "x")

zz <- rbind(y1,z1)

2.1 Comparisons with raw data

First of all, we decide to show only the station of Uccle, which is considered, to some extent, as the “main station”. In this dynamic plot provided by \(\texttt{plotly}\), you can hide the classes you do not want to see; e.g. to see only the raw data you can click on both the red and blue buttons. This plot can conveniently show us, among other things, how the filled values “fit” the shape of the raw data.

# Uccle (wnUC2) only, with a flag for negative (hence invalid) filled values
zz_neg_ucc <- data.frame( wnUC2 = zz$wnUC2, Date = zz$Date, x = zz$x, negative = zz$wnUC2 < 0 )
#z1$x <- ifelse(z1$wnUC2 < 0, "Filled <0", "Filled" ) 

#zz_neg_ucc[!is.na(zz_neg_ucc$x == "Filled"), ][,"x"] <- NA
## Visualize "incorrect" filled values, ie those below 0 
# zz_neg_ucc[!is.na(zz_neg_ucc$x == "Filled"), ][ ,"x"] <- ifelse(zz_neg_ucc[!is.na(zz_neg_ucc$x != "Raw"), ][ ,"wnUC2"] < 0, "Filled <0", "Filled" ) 
# The second half of zz_neg_ucc holds the "Filled" rows (zz = rbind(y1, z1));
# relabel the negative ones.
# NOTE(review): ifelse() drops factor attributes, so the as.factor() calls
# here may return underlying integer codes — confirm the labels survive.
zz_neg_ucc[(nrow(zz_neg_ucc)/2+1):nrow(zz_neg_ucc), ][ ,"x"] <- ifelse(zz_neg_ucc[(nrow(zz_neg_ucc)/2+1):nrow(zz_neg_ucc), ][ ,"wnUC2"] < 0, as.factor("Filled <0"), as.factor("Filled") ) 
zz_neg_ucc[is.na(zz_neg_ucc$x),][,'x'] <- "Filled <0"
# zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$x  <- ifelse( zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$wnUC2 < 0, "Filled < 0", "Filled" )


# for (i in 1:(nrow(zz)/2) ){ 
#   if ( zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$wnUC2[i] < 0 )
#     zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$x[i] <- "Filled <0"
#   
#  # else  zz_neg_ucc[zz_neg_ucc$x == "Filled", ]$x[i] <- "Filled" 
# }


# plotly styling: title font
f <- list(
  family = "Calibri",
  size = 16,
  color = "#33666C"
)
# plotly styling: tick-label font
f2 <- list(
  family = "Old Standard TT, serif",
  size = 14,
  color = "black"
)
# x-axis specification
xl <- list(
  title = " Date",
  titlefont = f,
  tickfont = f2,
   showticklabels = TRUE
)
# y-axis specification (title is re-assigned before each plot below)
yl <- list(
  title = "SSN for wnUC2",
  titlefont = f,
  tickfont = f2,
  showticklabels = TRUE
)
# plot title specification
tit <- list(
  title = "Dynamic plot of the SSN for  the station of Uccle",
  titlefont = f
)
# legend specification
l <- list(
  font = list(
    family = "sans-serif",
    size = 13,
    color = "#000"),
  bgcolor = "#E2E2E2",
  bordercolor = "#FFFFFF",
  borderwidth = 2)

# Interactive plot of raw vs filled (and negative filled) values for Uccle
plot_ly(data = zz_neg_ucc, x = ~Date, y = ~`wnUC2`, type = "scattergl", color = ~x, colors = c("green", "blue", "red")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)# %>% add_data(plot_ly(data = zz_neg_ucc[zz_neg_ucc$negative.wnUC2,], x = ~Date, y = ~`wnUC2`, type = "scattergl", colors = c("green"))) #%>% add_trace()%>% add_markers(color = ~as.factor(negative.wnUC2)) 


\(\underline{\text{Comments}}\) :

  • We see that there seem to be some problems concerning the positivity of the values.

For convenience, we now decide to show the two retained stations that have the greatest number of available values, i.e. 21465 (%) and 20855 (%) for wnKS2 and wnKZ2 respectively, and also the two stations with the lowest number of available values.

# Coverage (number of non-missing values) per retained station
notNa.sum <- apply(data.mat2.fin, 2, function(x) sum(!is.na(x)))
#sort(notNa.sum)/nrow(data.mat2.fin) * 100

# Summary table: counts (and %) of available values for selected stations
df <- data.frame(wnKS2 = "21465 (64.4%)", wnKZ2 = "18957 (56.9%) ", `wnFR-S` = "2119 (6.4%)", `wnGU-S` = "2185 (6.6%)" )
rownames(df) <- c("# available values ")
pander(df)
  wnKS2 wnKZ2 wnFR.S wnGU.S
# available values 21465 (64.4%) 18957 (56.9%) 2119 (6.4%) 2185 (6.6%)
# One raw-vs-filled plot per station of interest, reusing the layout objects
yl$title <- "SSN for wnFR-S"
p1 <- plot_ly(data = zz, x = ~Date, y = ~`wnFR-S`, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)

yl$title <- "SSN for wnGU-S"
p2 <- plot_ly(data = zz, x = ~Date, y = ~`wnGU-S` , type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) 

yl$title <- "SSN for wnKS2"
p3 <- plot_ly(data = zz, x = ~Date, y = ~wnKS2, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) 

yl$title <- "SSN for wnKZ2"
p4 <- plot_ly(data = zz, x = ~Date, y = ~wnKZ2, type = "scattergl", color = ~x, colors = c("red", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l) 

# The two stations with the lowest coverage, side by side
subplot(p1, p2, nrows = 1,shareY = T, titleY = T )

For these two stations, we remark that the small number of available values indeed impact the uncertainty and the way these missing values are filled by the algorithm.

In contrast, when the number of available values is high \(\Rightarrow\)

subplot(p3, p4, nrows = 1,shareY = T, titleY = T)

The filling seems much more homogeneous and thus robust.

# z1 <- cbind(as.data.frame(zz[nrow(data.mat),]), Date = data.mat$Date )
# colnames(z1) <- c(colnames(data.mat2.fin), "Date")
# zz.full <- melt(z1, id.vars = "Date" )
# dimnames(zz.full)[[2]][2] <- "Station"
# dimnames(zz.full)[[2]][3] <- "SSN"

# NOTE(review): 'zz.full' is only created in the commented-out lines above,
# so the two plots below fail unless that code is run first — TODO fix.
g <- ggplot(zz.full) + geom_line(aes( x = Date, y = SSN, col = Station), size = .4) +
 scale_color_hue(l = 40, c = 200) + theme_piss()
  # scale_colour_brewer(name=expression(underline(bold("data: "))),
  #                     palette = "Set1") + 
  #guides(colour = guide_legend(override.aes = list(size= 2.5))) 

# ggplotly(g)

# Interactive version, one trace per station
plot_ly(data = zz.full, x = ~Date, y = ~SSN, type = "scattergl", color = ~Station )

We can first notice a “problem”: some of the SSN values are below 0.

Couldn’t we

2.2 Comparisons with other filling methods

2.3 Filling method of “Vero”, time series starting from 1981

1. Are the non-missing values still the same ?

We hope so, but we will check it.

# Filled series from "Vero"'s method (time series starting in 1981)
filledt.vero <- read.csv("time_SSNf_1981.csv")

load("DataSSN81.RData") # Load data we have build

# Label the filled matrix with the station names and append the dates
colnames(zssn) <- colnames(z1[, -ncol(z1)])
zssn <- cbind.data.frame(zssn, times = z1$Date)

## add indicators for missingness
missings <- is.na(y)
colnames(missings) <- paste0("miss_", colnames(missings))
zssn_complete <- cbind.data.frame(zssn, missings)
sum(zssn_complete$miss_wnUC2)
## [1] 3914
# Difference on the originally observed values only: expected to be ~0
head(dif <- zssn_complete$wnUC2[!zssn_complete$miss_wnUC2] - data.mat2.fin$wnUC2[!zssn_complete$miss_wnUC2] )
## [1] 0.000000000 0.002434213 0.002434213 0.002434213 0.002434213 0.002434213
sum (dif) / 0.0024342
## [1] 8598.047

The difference is actually the same for all the observations. This is not really an intrinsic difference but rather a variable-type difference, the first being integer and the latter being real (numeric).

2. Comparisons with all data (missing & non-missing)

# df_uccle_vero <- data.frame(time = zssn$times[1:nrow(filledt.vero)], wnUC2 = filledt.vero$UC2, from = "Vero")
# 
# df_uccle_v <- rbind.data.frame(df_uccle_chris, data.frame(time = zssn$times[1:nrow(filledt.vero)], wnUC2 = zssn$wnUC2[1:nrow(filledt.vero)], from = "`interpolEM`"))
# 
# plot_ly(data = df_uccle_v, x = ~time, y = ~`wnUC2`, type = "scattergl", color = ~from, colors = c("green", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)


# Stack Vero's filled Uccle series and ours for a joint plot
df_uccle_vero <- data.frame(time = z1$Date[1:nrow(filledt.vero)], SSN_wnUC2 = filledt.vero$UC2, from = "Vero")

df_uccle_c <- rbind.data.frame(df_uccle_vero, data.frame(time = z1$Date[1:nrow(df_uccle_vero)],
                                                         SSN_wnUC2 = zssn[,"wnUC2"][1:nrow(df_uccle_vero)], from = "`interpolEM`"))

# Both series over time, with the solar-cycle minima as reference lines
ggplot(df_uccle_c, aes(col = from)) + geom_point(aes(x = time, y = SSN_wnUC2), size = 0.15) + 
  solar.cycle() + theme_piss() +  guides(colour = guide_legend(override.aes = list(size= 3))) + ggtitle("Filled missing values")

Analyze residuals

# Residuals: Vero's filled values minus ours, over time
resdf_vero <- data.frame(time = z1$Date[1:nrow(filledt.vero)],
                         res = (filledt.vero$UC2) - (zssn[,"wnUC2"][1:nrow(df_uccle_vero)]) ) 
max(resdf_vero$res)  ; min(resdf_vero$res)
## [1] 478.9861
## [1] -239.0024
library(RColorBrewer)
# Diverging palette to colour the residuals by magnitude
myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral")))

ggplot(resdf_vero, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))

Or we could also visualize the discrepancies between the filled values

Only for the filled data

# Restrict the comparison to the values that were originally missing
filled_ssn <- zssn[,"wnUC2"][1:nrow(df_uccle_vero)][zssn_complete$miss_wnUC2]

# NOTE(review): the max/min below returned NA — presumably a length/alignment
# mismatch between the miss_wnUC2 indicator and filledt.vero; verify.
resdf_vero <- data.frame(time = z1$Date[1:nrow(filledt.vero)][zssn_complete$miss_wnUC2],
                         res = (filledt.vero$UC2[zssn_complete$miss_wnUC2]) - filled_ssn ) 
max(resdf_vero$res)  ; min(resdf_vero$res)
## [1] NA
## [1] NA
myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral")))

# ggplot(resdf_vero, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))
ggplot(resdf_vero, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35)  + theme_piss() + solar.cycle() +
  scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))

Filling method of C.Ritter, time series starting from 1960

Again, we will take the station of Uccle for reference.

# (first attempt, kept commented for reference)
# filledt.chris <- read.csv("SSNfilledChris.csv") # Load data
# 
# load("DataSSN60.RData") # Load data we have build
# 
# filled.chris2 <- filledt.chris[,-2]
# 
# df_uccle_chris <- data.frame(time = filled.chris2$times, wnUC2 = filled.chris2$wnUC2_d.txt, from = "Chris")
# 
# df_uccle_c <- rbind.data.frame(df_uccle_chris, data.frame(time = filled.chris2$times, wnUC2 = zssn$wnUC2[1:nrow(df_uccle_chris)], from = "`interpolEM`"))
# 
# plot_ly(data = df_uccle_c, x = ~time, y = ~`wnUC2`, type = "scattergl", color = ~from, colors = c("green", "blue")) %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)

# C. Ritter's filled series (time series starting in 1960)
filledt.chris <- read.csv("SSNfilledChris.csv") # Load data

load("DataSSN60.RData") # Load data we have build

filled.chris2 <- filledt.chris[,-2]  # drop the second column

## Are the non-missing values still the same ?? 
## add indicators for missingness
missings <- is.na(y)
colnames(missings) <- paste0("miss_", colnames(missings))
zssn_complete <- cbind.data.frame(zssn, missings)
sum(zssn_complete$miss_wnUC2)
## [1] 6056
# Difference on originally observed values: only numeric noise expected
head(dif <- zssn_complete$wnUC2[!zssn_complete$miss_wnUC2] - data.mat2.fin$wnUC2[!zssn_complete$miss_wnUC2] )
## [1]  0.000000e+00 -2.842171e-14 -5.684342e-14  0.000000e+00 -2.842171e-14
## [6]  0.000000e+00
sum (dif) / 0.0024342
## [1] -2.138459e-08
## [1] -2.138459e-08

Same idea. The very slight discrepancies are due to (type) approximations.

# Stack Chris' filled Uccle series and ours for a joint plot
df_uccle_chris <- data.frame(time = filled.chris2$times, wnUC2 = filled.chris2$wnUC2_d.txt, from = "Chris")

df_uccle_c <- rbind.data.frame(df_uccle_chris, data.frame(time = filled.chris2$times, wnUC2 = zssn$wnUC2[1:nrow(df_uccle_chris)], from = "`interpolEM`"))


# Comparisons for first 5 years (1980 : 1985)
plot_ly(data = df_uccle_c[1980 < df_uccle_c$time & df_uccle_c$time < 1985 ,], x = ~time, y = ~`wnUC2`, color = ~from, type = "scattergl", colors = c("green", "blue"), mode =  "markers+text") %>% layout(xaxis = xl, yaxis = yl, title = tit, legend = l)
## Residuals

# Residuals: Chris' filled values minus ours, over time
resdf_chris <- data.frame(time = z1$Date[1:nrow(filled.chris2)],
                         res = (filled.chris2$wnUC2_d.txt) - (zssn[,"wnUC2"][1:nrow(filled.chris2)]) ) 
max(resdf_chris$res)  ; min(resdf_chris$res)
## [1] 87.31913
## [1] -94.84829
# ggplot(resdf_chris, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-95, 88))
ggplot(resdf_chris, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35)  + theme_piss() + solar.cycle() +
  scale_colour_gradientn(colours = myPalette(10),limits=c(-95, 88))

Only for filled values :

# Residuals restricted to the values that were originally missing
filled_ssn <- zssn[,"wnUC2"][1:nrow(df_uccle_chris)][zssn_complete$miss_wnUC2]

# NOTE(review): max/min below returned NA — presumably an index/length
# mismatch between miss_wnUC2 and filled.chris2; verify the alignment.
resdf_chris <- data.frame(time = z1$Date[1:nrow(filled.chris2)][zssn_complete$miss_wnUC2],
                         res = (filled.chris2$wnUC2_d.txt[zssn_complete$miss_wnUC2]) - filled_ssn ) 
max(resdf_chris$res)  ; min(resdf_chris$res)
## [1] NA
## [1] NA
myPalette <- colorRampPalette(rev(brewer.pal(4, "Spectral")))

# ggplot(resdf_chris, aes(x = time, y = res, col = res)) + geom_point( size = 0.35) + theme_piss() + solar.cycle() + scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))
ggplot(resdf_chris, aes(col = res)) + geom_line(aes(x = time, y = res), size = 0.35)  + theme_piss() + solar.cycle() +
  scale_colour_gradientn(colours = myPalette(10), limits=c(-150, 250))

Discrepancies are less than with Vero’s method…

We may want to pick all the times or stations that have big discrepancies. This could be informative.

# rownames(zssn) <- data.mat$decdate
# zssn <- as.data.frame(zssn) 
# colnames(zssn) <- colnames(data.mat2.fin) 
# write.csv(zssn, "SSNfillFULL_R.csv")